{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /var/folders/sc/3mnt5tgs419_hk7s16gq61p80000gn/T/jieba.cache\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 1/4:...预处理   语料 ...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading model cost 0.543 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 2/4:...收集 共现词线索 ...\n",
      "Step 3/4:...计算    互信息 ...\n",
      "Step 4/4:...保存    候选词 ...\n",
      "完成! 耗时 49.50996398925781 s\n"
     ]
    }
   ],
   "source": [
    "from cntext import SoPmi\n",
    "import os\n",
    "\n",
    "sopmier = SoPmi(cwd=os.getcwd(),\n",
    "                input_txt_file='data/sopmi_corpus.txt',  #原始数据，您的语料\n",
    "                seedword_txt_file='data/sopmi_seed_words.txt', #人工标注的初始种子词\n",
    "                )   \n",
    "\n",
    "sopmier.sopmi()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 1/4:...预处理    语料 ...\n",
      "Step 2/4:...训练   word2vec模型 ...\n",
      "Step 3/4:...准备 每个seed在word2vec模型中的相似候选词...\n",
      "Step 4/4 完成! 耗时 60 s\n",
      "\n",
      "\n",
      "\n",
      "Step 3/4:...准备 每个seed在word2vec模型中的相似候选词...\n",
      "Step 4/4 完成! 耗时 60 s\n",
      "\n",
      "\n",
      "\n",
      "Step 3/4:...准备 每个seed在word2vec模型中的相似候选词...\n",
      "Step 4/4 完成! 耗时 60 s\n",
      "\n",
      "\n",
      "\n",
      "Step 3/4:...准备 每个seed在word2vec模型中的相似候选词...\n",
      "Step 4/4 完成! 耗时 60 s\n",
      "\n",
      "\n",
      "\n",
      "Step 3/4:...准备 每个seed在word2vec模型中的相似候选词...\n",
      "Step 4/4 完成! 耗时 60 s\n",
      "\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from cntext import W2VModels\n",
    "import os\n",
    "\n",
    "#初始化模型\n",
    "model = W2VModels(cwd=os.getcwd())  #语料数据 w2v_corpus.txt\n",
    "model.train(input_txt_file='data/w2v_corpus.txt')\n",
    "\n",
    "\n",
    "#根据种子词，筛选出没类词最相近的前100个词\n",
    "model.find(seedword_txt_file='data/w2v_seeds/integrity.txt', \n",
    "           topn=100)\n",
    "model.find(seedword_txt_file='data/w2v_seeds/innovation.txt', \n",
    "           topn=100)\n",
    "model.find(seedword_txt_file='data/w2v_seeds/quality.txt', \n",
    "           topn=100)\n",
    "model.find(seedword_txt_file='data/w2v_seeds/respect.txt', \n",
    "           topn=100)\n",
    "model.find(seedword_txt_file='data/w2v_seeds/teamwork.txt', \n",
    "           topn=100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 需要注意\n",
    "\n",
    "训练出的w2v模型可以后续中使用。\n",
    "\n",
    "```python\n",
    "from gensim.models import KeyedVectors\n",
    "\n",
    "w2v_model = KeyedVectors.load(w2v.model路径)\n",
    "#找出最相近的词\n",
    "#w2v_model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])\n",
    "#w2v_model.most_similar_cosmul(positive=['女人', '国王'], negative=['男人'])\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>.</th>\n",
       "      <th>bus</th>\n",
       "      <th>by</th>\n",
       "      <th>day</th>\n",
       "      <th>every</th>\n",
       "      <th>go</th>\n",
       "      <th>i</th>\n",
       "      <th>night</th>\n",
       "      <th>school</th>\n",
       "      <th>theatre</th>\n",
       "      <th>to</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>.</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bus</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>by</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>day</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>every</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>go</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>i</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>night</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>school</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>theatre</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>to</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         .  bus  by  day  every  go  i  night  school  theatre  to\n",
       ".        0    1   1    0      0   0  0      0       0        0   0\n",
       "bus      1    0   2    1      0   0  0      1       0        0   0\n",
       "by       1    2   0    1      2   0  0      1       0        0   0\n",
       "day      0    1   1    0      1   0  0      0       1        0   0\n",
       "every    0    0   2    1      0   0  0      1       1        1   2\n",
       "go       0    0   0    0      0   0  2      0       1        1   2\n",
       "i        0    0   0    0      0   2  0      0       0        0   2\n",
       "night    0    1   1    0      1   0  0      0       0        1   0\n",
       "school   0    0   0    1      1   1  0      0       0        0   1\n",
       "theatre  0    0   0    0      1   1  0      1       0        0   1\n",
       "to       0    0   0    0      2   2  2      0       1        1   0"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from cntext import co_occurrence_matrix\n",
    "\n",
    "documents = [\"I go to school every day by bus .\",\n",
    "         \"i go to theatre every night by bus\"]\n",
    "\n",
    "co_occurrence_matrix(documents, window_size=2, lang='english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /var/folders/sc/3mnt5tgs419_hk7s16gq61p80000gn/T/jieba.cache\n",
      "Loading model cost 0.565 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Python</th>\n",
       "      <th>好玩</th>\n",
       "      <th>学</th>\n",
       "      <th>很</th>\n",
       "      <th>是</th>\n",
       "      <th>最好</th>\n",
       "      <th>的</th>\n",
       "      <th>编程</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Python</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>好玩</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>学</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>很</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>是</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>最好</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>的</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>编程</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        Python  好玩  学  很  是  最好  的  编程\n",
       "Python       0   0  0  0  1   1  0   0\n",
       "好玩           0   0  0  1  0   0  0   1\n",
       "学            0   0  0  0  1   1  1   1\n",
       "很            0   1  0  0  0   0  0   1\n",
       "是            1   0  1  0  0   1  0   0\n",
       "最好           1   0  1  0  1   0  1   0\n",
       "的            0   0  1  0  0   1  0   1\n",
       "编程           0   1  1  1  0   0  1   0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "documents2 = [\"编程很好玩\",\n",
    "             \"Python是最好学的编程\"]\n",
    "\n",
    "co_occurrence_matrix(documents2, window_size=2, lang='chinese')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
